import jieba
import pandas as pd

from utils.config import Config

# Shared configuration object, built from a hard-coded absolute Windows path.
# This file reads its origin_train / origin_dev / origin_test attributes as
# input CSV paths and train_path / dev_path / test_path as output CSV paths.
# NOTE(review): the path is machine-specific (presumably the project root) —
# confirm before running on another machine.
config = Config('E:/Python+AI/group4_nlp_project')


def cutText(text):
    """Segment *text* with jieba and join the tokens with single spaces.

    Args:
        text: The value to tokenize. Anything that is not a ``str``
            (``None``, a float NaN coming from pandas, numbers, lists, ...)
            is treated as empty text.

    Returns:
        The tokens produced by ``jieba.lcut`` joined by ``" "``, or ``""``
        when *text* is not a string.
    """
    # A single isinstance check already covers NaN and None. Unlike
    # ``pd.isna`` it never produces a boolean array, whose ambiguous truth
    # value made the previous condition raise ValueError for list/array input.
    if not isinstance(text, str):
        return ""
    return " ".join(jieba.lcut(text))


def dataEDA(data_path):
    """Tokenize the ``review`` column of a CSV and save the result.

    The CSV at *data_path* is loaded, a ``text`` column is added holding
    ``cutText`` applied to every ``review`` value, and the frame is written
    (without the index) to the output path paired with *data_path* in
    ``config``.

    Args:
        data_path: Path of the CSV to process. It is compared by equality
            against ``config.origin_train``, ``config.origin_dev`` and
            ``config.origin_test``, in that order.

    Returns:
        None. When *data_path* equals none of the configured origin paths,
        nothing is written.
    """
    frame = pd.read_csv(data_path)
    frame['text'] = frame['review'].apply(cutText)

    # (input attribute, output attribute) pairs on ``config``. Attributes are
    # resolved lazily and the first match wins, mirroring an if/elif chain.
    routes = (
        ('origin_train', 'train_path'),
        ('origin_dev', 'dev_path'),
        ('origin_test', 'test_path'),
    )
    for source_attr, target_attr in routes:
        if data_path == getattr(config, source_attr):
            frame.to_csv(getattr(config, target_attr), index=False)
            break


if __name__ == '__main__':
    # Quick smoke check of the tokenizer on a sample sentence.
    sample = "我今天很开心"
    print(cutText(sample))

    # Process the test split, then the dev split. The training split
    # (config.origin_train) is currently not processed here.
    for split_attr in ('origin_test', 'origin_dev'):
        dataEDA(getattr(config, split_attr))
